{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'xgboost'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-503653c6ce02>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodel_selection\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mKFold\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmean_absolute_error\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mxgboost\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mXGBRegressor\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[0mtrain\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../data/used_car_train_20200313.csv'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m' '\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'"
     ]
    }
   ],
   "source": [
    "# MAE: 528.190\n",
    "# XGBoostRegressor\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "train = pd.read_csv('../data/used_car_train_20200313.csv', sep=' ')\n",
    "test = pd.read_csv('../data/used_car_testB_20200421.csv', sep=' ')\n",
    "\n",
    "# 合并训练数据和测试数据集\n",
    "all_data = pd.concat([train, test], ignore_index=True)\n",
    "\n",
    "# 对 price 做对数变换\n",
    "all_data['price'] = np.log1p(all_data['price'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理异常值，如功率大于 600 的值\n",
    "all_data['power'] = all_data['power'].apply(lambda x: 600 if x > 600 else x)\n",
    "\n",
    "# 处理日期相关信息\n",
    "all_data['reg_year'] = all_data['regDate'].apply(lambda x: int(str(x)[:4]))\n",
    "all_data['reg_month'] = all_data['regDate'].apply(lambda x: int(str(x)[4:6]))\n",
    "all_data['reg_day'] = all_data['regDate'].apply(lambda x: int(str(x)[6:]))\n",
    "all_data['creat_year'] = all_data['creatDate'].apply(lambda x: int(str(x)[:4]))\n",
    "all_data['creat_month'] = all_data['creatDate'].apply(lambda x: int(str(x)[4:6]))\n",
    "all_data['creat_day'] = all_data['creatDate'].apply(lambda x: int(str(x)[6:]))\n",
    "\n",
    "# 标记汽车没有经过维修\n",
    "all_data['notRepairedDamage'] = all_data['notRepairedDamage'].apply(lambda x: 0 if x == '-' else 1)\n",
    "\n",
    "# 对可分类的连续特征进行分桶，如将功率（power）分成10个分桶，并提取新特征\n",
    "all_data['power_bucket'] = pd.cut(all_data['power'], 10, labels=False)\n",
    "new_cols = ['power_bucket', 'v_0', 'v_3', 'v_8', 'v_12']\n",
    "for col1 in new_cols:\n",
    "    for col2 in new_cols:\n",
    "        if col1 != col2:\n",
    "            all_data['{}_{}_sum'.format(col1, col2)] = all_data[col1] + all_data[col2]\n",
    "            all_data['{}_{}_diff'.format(col1, col2)] = all_data[col1] - all_data[col2]\n",
    "\n",
    "# 处理缺失值\n",
    "all_data['fuelType'] = all_data['fuelType'].fillna(0)\n",
    "all_data['gearbox'] = all_data['gearbox'].fillna(0)\n",
    "all_data['bodyType'] = all_data['bodyType'].fillna(0)\n",
    "all_data['model'] = all_data['model'].fillna(0)\n",
    "\n",
    "# 分离特征和标签\n",
    "train_data = all_data[~all_data['price'].isnull()]\n",
    "test_data = all_data[all_data['price'].isnull()]\n",
    "X_train = train_data.drop(['SaleID', 'name', 'regDate', 'creatDate', 'price'], axis=1)\n",
    "X_test = test_data.drop(['SaleID', 'name', 'regDate', 'creatDate', 'price'], axis=1)\n",
    "y_train = train_data['price']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义模型参数\n",
    "xgb_model = XGBRegressor(\n",
    "    max_depth=10,\n",
    "    learning_rate=0.05,\n",
    "    n_estimators=1000,\n",
    "    gamma=0.005,\n",
    "    subsample=0.9,\n",
    "    colsample_bytree=0.7,\n",
    "    objective='reg:squarederror',\n",
    "    n_jobs=-1,\n",
    "    random_state=2021,\n",
    "    eval_metric='mae'\n",
    ")\n",
    "\n",
    "# 交叉验证以及训练模型\n",
    "skf = KFold(n_splits=5, shuffle=True, random_state=2021)\n",
    "oof = np.zeros(len(X_train))\n",
    "test_predict = np.zeros(len(X_test))\n",
    "for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):\n",
    "    print(\"Training on Fold {}\".format(i+1))\n",
    "    tr_x, tr_y = X_train.iloc[train_index], y_train.iloc[train_index]\n",
    "    vl_x, vl_y = X_train.iloc[val_index], y_train.iloc[val_index]\n",
    "    xgb_model.fit(\n",
    "        tr_x, tr_y,\n",
    "        eval_set=[(vl_x, vl_y)],\n",
    "        early_stopping_rounds=100,\n",
    "        verbose=200\n",
    "    )\n",
    "\n",
    "    oof[val_index] = xgb_model.predict(vl_x)\n",
    "    test_predict += xgb_model.predict(X_test) / skf.n_splits\n",
    "\n",
    "mae = mean_absolute_error(np.expm1(y_train), np.expm1(oof))\n",
    "print(\"MAE: {:.3f}\".format(mae))\n",
    "\n",
    "# 保存预测结果\n",
    "submission = pd.DataFrame({'SaleID': test_data['SaleID'], 'price': np.expm1(test_predict)})\n",
    "submission.to_csv('xgb_submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
